Adding some more judges, here and there.
[andmenj-acm.git] / lib / Mi manual de algoritmos / version_actual / src / strings / aho-corasick.cpp
blob95cee46793449fea311e1b61d596cbebed289f06
1 ///////////////////////////////////////////////////////////////
2 // Aho-Corasick's algorithm, as explained in //
3 // http://dx.doi.org/10.1145/360825.360855 //
4 ///////////////////////////////////////////////////////////////
6 // Max number of states in the matching machine.
7 // Should be equal to the sum of the length of all keywords.
8 const int MAXS = 6 * 50 + 10;
10 // Number of characters in the alphabet.
11 const int MAXC = 26;
13 // Output for each state, as a bitwise mask.
14 // Bit i in this mask is on if the keyword with index i
15 // appears when the machine enters this state.
16 int out[MAXS];
18 // Used internally in the algorithm.
19 int f[MAXS]; // Failure function
20 int g[MAXS][MAXC]; // Goto function, or -1 if fail.
22 // Builds the string matching machine.
23 //
24 // words - Vector of keywords. The index of each keyword is
25 // important:
26 // "out[state] & (1 << i)" is > 0 if we just found
27 // word[i] in the text.
28 // lowestChar - The lowest char in the alphabet.
29 // Defaults to 'a'.
30 // highestChar - The highest char in the alphabet.
31 // Defaults to 'z'.
32 // "highestChar - lowestChar" must be <= MAXC,
33 // otherwise we will access the g matrix outside
34 // its bounds and things will go wrong.
36 // Returns the number of states that the new machine has.
37 // States are numbered 0 up to the return value - 1, inclusive.
38 int buildMatchingMachine(const vector<string> &words,
39 char lowestChar = 'a',
40 char highestChar = 'z') {
41 memset(out, 0, sizeof out);
42 memset(f, -1, sizeof f);
43 memset(g, -1, sizeof g);
45 int states = 1; // Initially, we just have the 0 state
47 for (int i = 0; i < words.size(); ++i) {
48 const string &keyword = words[i];
49 int currentState = 0;
50 for (int j = 0; j < keyword.size(); ++j) {
51 int c = keyword[j] - lowestChar;
52 if (g[currentState][c] == -1) {
53 // Allocate a new node
54 g[currentState][c] = states++;
56 currentState = g[currentState][c];
58 // There's a match of keywords[i] at node currentState.
59 out[currentState] |= (1 << i);
62 // State 0 should have an outgoing edge for all characters.
63 for (int c = 0; c < MAXC; ++c) {
64 if (g[0][c] == -1) {
65 g[0][c] = 0;
69 // Now, let's build the failure function
70 queue<int> q;
71 // Iterate over every possible input
72 for (int c = 0; c <= highestChar - lowestChar; ++c) {
73 // All nodes s of depth 1 have f[s] = 0
74 if (g[0][c] != -1 and g[0][c] != 0) {
75 f[g[0][c]] = 0;
76 q.push(g[0][c]);
79 while (q.size()) {
80 int state = q.front();
81 q.pop();
82 for (int c = 0; c <= highestChar - lowestChar; ++c) {
83 if (g[state][c] != -1) {
84 int failure = f[state];
85 while (g[failure][c] == -1) {
86 failure = f[failure];
88 failure = g[failure][c];
89 f[g[state][c]] = failure;
91 // Merge out values
92 out[g[state][c]] |= out[failure];
93 q.push(g[state][c]);
98 return states;
101 // Finds the next state the machine will transition to.
103 // currentState - The current state of the machine. Must be
104 // between 0 and the number of states - 1,
105 // inclusive.
106 // nextInput - The next character that enters into the machine.
107 // Should be between lowestChar and highestChar,
108 // inclusive.
109 // lowestChar - Should be the same lowestChar that was passed
110 // to "buildMatchingMachine".
112 // Returns the next state the machine will transition to.
113 // This is an integer between 0 and the number of states - 1,
114 // inclusive.
115 int findNextState(int currentState, char nextInput,
116 char lowestChar = 'a') {
117 int answer = currentState;
118 int c = nextInput - lowestChar;
119 while (g[answer][c] == -1) answer = f[answer];
120 return g[answer][c];
124 // How to use this algorithm:
126 // 1. Modify the MAXS and MAXC constants as appropriate.
127 // 2. Call buildMatchingMachine with the set of keywords to
128 // search for.
129 // 3. Start at state 0. Call findNextState to incrementally
130 // transition between states.
131 // 4. Check the out function to see if a keyword has been
132 // matched.
134 // Example:
136 // Assume keywords is a vector that contains
137 // {"he", "she", "hers", "his"} and text is a string that
138 // contains "ahishers".
140 // Consider this program:
142 // buildMatchingMachine(keywords, 'a', 'z');
143 // int currentState = 0;
144 // for (int i = 0; i < text.size(); ++i) {
145 // currentState = findNextState(currentState, text[i], 'a');
147 // Nothing new, let's move on to the next character.
148 // if (out[currentState] == 0) continue;
150 // for (int j = 0; j < keywords.size(); ++j) {
151 // if (out[currentState] & (1 << j)) {
152 // //Matched keywords[j]
153 // cout << "Keyword " << keywords[j]
154 // << " appears from "
155 // << i - keywords[j].size() + 1
156 // << " to " << i << endl;
157 // }
158 // }
159 // }
161 // The output of this program is:
163 // Keyword his appears from 1 to 3
164 // Keyword he appears from 4 to 5
165 // Keyword she appears from 3 to 5
166 // Keyword hers appears from 4 to 7
168 ///////////////////////////////////////////////////////////////
169 // End of Aho-Corasick's algorithm //
170 ///////////////////////////////////////////////////////////////